#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Simple CUDA MPS test
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2019 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
############################################################################
source ./globals

#
# Test identity, overall result state and the scratch files used by this test
#
set test_id     "40.8"
set file_in     "test${test_id}.input"
set file_out    "test${test_id}.output"
set file_prog   "test${test_id}.prog"
set exit_code   0
set job_id      0

print_header $test_id

#
# Prerequisite: select/cons_tres must be configured, otherwise skip the test
#
if {![test_cons_tres]} {
	send_user "\nWARNING: This test is only compatible with select/cons_tres\n"
	exit 0
}
send_user "\nValid configuration, using select/cons_tres\n"

#
# Prerequisite: MPS must be configured
#   negative count -> the lookup itself failed, abort with an error
#   zero           -> nothing to test, skip
#   below 100      -> flag the test as failed, but keep running
#
set mps_cnt [get_mps_count 1]
if {$mps_cnt < 0} {
	send_user "\nFAILURE: Error getting MPS count\n"
	exit 1
} elseif {$mps_cnt < 1} {
	send_user "\nWARNING: This test requires 1 or more MPS in the default partition\n"
	exit 0
} elseif {$mps_cnt < 100} {
	# This could be a legitimate configuration for testing purposes,
	# but it is not likely for production use
	send_user "\nFAILURE: MPS per node is unexpectedly low ($mps_cnt < 100). Check your configuration\n"
	set exit_code 1
}
send_user "\nMPS count per node is $mps_cnt\n"

#
# Remove leftovers from any previous run, then write the batch script.
# The script prints the CUDA related environment variables, compiles the
# test program with nvcc and runs two copies of it concurrently.
#
exec $bin_rm -f $file_in $file_out $file_prog
set batch_script_body "
env | grep CUDA_VISIBLE_DEVICES
env | grep CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
unset CUDA_VISIBLE_DEVICES
ls $nvcc
$nvcc ${file_prog}.cu -g -o $file_prog
./$file_prog &
./$file_prog &
wait"
make_bash_script $file_in $batch_script_body

#
# Submit the script as a single node batch job requesting gres/mps:10
# and remember the job id reported by sbatch
#
spawn $sbatch --output=$file_out -N1 --gres=mps:10 -t1 ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Without a job id there is nothing to wait for, so give up here
if {$job_id == 0} {
	send_user "\nFAILURE: batch not submitted\n"
	exit 1
}

#
# Wait for job to complete, then validate the contents of its output file.
#
# A passing run must yield exactly 6 matches:
#   1 x CUDA_VISIBLE_DEVICES               (printed by the batch script)
#   1 x CUDA_MPS_ACTIVE_THREAD_PERCENTAGE  (printed by the batch script)
#   2 x "Max error: 0"                     (presumably one per program copy)
#   2 x "Run Time (usec): <n>"             (presumably one per program copy)
# The two run times are then compared: if they differ by more than 20 percent
# the two program copies are assumed to have not run in parallel.
#
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: waiting for job to complete\n"
	cancel_job $job_id
	exit 1
}
if {[wait_for_file $file_out] == 0} {
	set matches 0
	set no_nvcc 0
	set run_time1 0
	set run_time2 0

	#
	# Pass 1 (echoed to the user): look for known error text and count
	# the environment variable and "Max error" lines.
	# Run times are intentionally NOT matched here; they are collected
	# in the second pass below.
	#
	spawn $bin_cat $file_out
	expect {
		-re "Couldn't allocate memory" {
			# Not a test failure: end the test right away
			send_user "\nWARNING: This means the gpu selected doesn't support this test.\n"
			exit 0
		}
		-re "No such file" {
			incr no_nvcc
			exp_continue
		}
		-re "Could not find" {
			incr no_nvcc
			exp_continue
		}
		-re "CUDA_VISIBLE_DEVICES" {
			incr matches
			exp_continue
		}
		-re "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE" {
			incr matches
			exp_continue
		}
		-re "Max error: 0" {
			incr matches
			exp_continue
		}
		eof {
			wait
		}
	}

	#
	# Pass 2 (silent): collect the run time reported by each program copy.
	# The literal parentheses around "usec" are matched with "." since
	# unescaped parentheses would form a regular expression capture group
	# instead of matching the text, and would shift the captured number
	# out of expect_out(1,string).
	#
	log_user 0
	spawn $bin_cat $file_out
	expect {
		-re "Run Time .usec.: ($number)" {
			# First value seen goes to run_time1, any later one to run_time2
			if {$run_time1 == 0} {
				set run_time1 $expect_out(1,string)
			} else {
				set run_time2 $expect_out(1,string)
			}
			incr matches
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1

	# A missing CUDA compiler only produces a warning, not a failure
	if {$no_nvcc != 0} {
		send_user "\nWARNING: Could not find program nvcc (CUDA compiler)\n"
	} elseif {$matches != 6} {
		send_user "\nFAILURE: CUDA output not as expected ($matches != 6)\n"
		set exit_code 1
	}

	# Compare the two run times (integer percentage relative to the first)
	if {$run_time1 > 0} {
		set delta_t [expr {abs($run_time1 - $run_time2)}]
		set percent_time_diff [expr {($delta_t * 100) / $run_time1}]
		if {$percent_time_diff > 20} {
			send_user "\nFAILURE: CUDA MPS jobs appear to have not run in parallel\n"
			send_user "FAILURE: Run time difference was $percent_time_diff percent\n"
			set exit_code 1
		} else {
			send_user "\nCUDA MPS jobs do appear to have run in parallel\n"
			send_user "Run time difference was $percent_time_diff percent\n"
		}
	}
} else {
	# Output file never showed up
	set exit_code 1
}

#
# Report the result. The scratch files are removed only when the test
# passed, so they remain available for inspection after a failure.
#
if {$exit_code != 0} {
	exit $exit_code
}
exec $bin_rm -f $file_in $file_out $file_prog
send_user "\nSUCCESS\n"
exit $exit_code
